HEAD ======= >>>>>>> 96a75bd8e0458a8ddb96ee845073289da15b496e
## Automatically generates rmarkdown output file
# Subset gene biotypes to be analyzed
subsetGenes="protein_coding"#NULL
# Specify the resolution of the unsupervised clustering algorithm
resolution=1.0
# Subset cells
if (getwd()!="/Users/schilder/Desktop/PD_scRNAseq"){
subsetCells=500
} else {subsetCells=NULL}
# params <- list(set_title=paste(sep="", "PDscRNAseq__",
# "Genes-",subsetGenes,"__Cells-",subsetCells,"__Resolution-",resolution,
# ".html"))
kableStyle = c("striped", "hover", "condensed", "responsive")
knitr::opts_chunk$set(echo=T, error=T, cache=T, cache.lazy=F)
# rmarkdown::render(input = "run_seurat.Rmd", output_file = params$set_title, output_format = "html_document")
nCores = parallel::detectCores()
print(paste("**** Utilized Cores **** =", parallel::detectCores() )) ## [1] "**** Utilized Cores **** = 4"
library(Seurat)
library(dplyr)
library(gridExtra)
library(knitr)
library(kableExtra)
## Install Bioconductor
# if (!requireNamespace("BiocManager"))
# install.packages("BiocManager")
# BiocManager::install(c("biomaRt"))
library(biomaRt)
sessionInfo()## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
=======
## Automatically generates rmarkdown output file
# Subset gene biotypes to be analyzed
subsetGenes="protein_coding"#NULL
resolution=1 #0.6
if (getwd()!="/Users/schilder/Desktop/PD_scRNAseq"){
subsetCells=500 # NULL
} else {subsetCells=NULL}
# Specify the resolution of the unsupervised clustering algorithm
# params <- list(set_title=paste(sep="", "PDscRNAseq__",
# "Genes-",subsetGenes,"__Cells-",subsetCells,"__Resolution-",resolution,
# ".html"))
kableStyle = c("striped", "hover", "condensed", "responsive")
knitr::opts_chunk$set(echo=T, error=T, cache=T, cache.lazy=F)
# rmarkdown::render(input = "run_seurat.Rmd", output_file = params$set_title, output_format = "html_document")
nCores = parallel::detectCores()
print(paste("**** Utilized Cores **** =", parallel::detectCores() ))
## [1] "**** Utilized Cores **** = 12"
Load Libraries & Report Versions
## Error in library(kableExtra): there is no package called 'kableExtra'
## Install Bioconductor
# if (!requireNamespace("BiocManager"))
# install.packages("BiocManager")
# BiocManager::install(c("biomaRt"))
library(biomaRt)
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS release 6.9 (Final)
##
## Matrix products: default
## BLAS/LAPACK: /hpc/packages/minerva-common/intel/parallel_studio_xe_2018/compilers_and_libraries_2018.1.163/linux/mkl/lib/intel64_lin/libmkl_gf_lp64.so
##
## locale:
## [1] C
>>>>>>> 96a75bd8e0458a8ddb96ee845073289da15b496e
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
<<<<<<< HEAD
## [1] kableExtra_0.9.0 knitr_1.21 readxl_1.2.0 bindrcpp_0.2.2
## [5] biomaRt_2.38.0 gridExtra_2.3 dplyr_0.7.8 Seurat_2.3.4
## [9] Matrix_1.2-15 cowplot_0.9.3 ggplot2_3.1.0
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.15 colorspace_1.3-2 class_7.3-14
## [4] modeltools_0.2-22 ggridges_0.5.1 mclust_5.4.2
## [7] htmlTable_1.12 base64enc_0.1-3 rstudioapi_0.8
## [10] proxy_0.4-22 npsurv_0.4-0 flexmix_2.3-14
## [13] bit64_0.9-7 AnnotationDbi_1.44.0 mvtnorm_1.0-8
## [16] xml2_1.2.0 codetools_0.2-16 splines_3.5.1
## [19] R.methodsS3_1.7.1 lsei_1.2-0 robustbase_0.93-3
## [22] jsonlite_1.6 Formula_1.2-3 ica_1.0-2
## [25] cluster_2.0.7-1 kernlab_0.9-27 png_0.1-7
## [28] R.oo_1.22.0 readr_1.3.1 compiler_3.5.1
## [31] httr_1.4.0 backports_1.1.3 assertthat_0.2.0
## [34] lazyeval_0.2.1 prettyunits_1.0.2 lars_1.2
## [37] acepack_1.4.1 htmltools_0.3.6 tools_3.5.1
## [40] igraph_1.2.2 gtable_0.2.0 glue_1.3.0
## [43] reshape2_1.4.3 RANN_2.6 Rcpp_1.0.0
## [46] Biobase_2.42.0 cellranger_1.1.0 trimcluster_0.1-2.1
## [49] gdata_2.18.0 ape_5.2 nlme_3.1-137
## [52] iterators_1.0.10 fpc_2.1-11.1 gbRd_0.4-11
## [55] lmtest_0.9-36 xfun_0.4 stringr_1.3.1
## [58] rvest_0.3.2 irlba_2.3.2 gtools_3.8.1
## [61] XML_3.98-1.16 DEoptimR_1.0-8 MASS_7.3-51.1
## [64] zoo_1.8-4 scales_1.0.0 hms_0.4.2
## [67] doSNOW_1.0.16 parallel_3.5.1 RColorBrewer_1.1-2
## [70] yaml_2.2.0 memoise_1.1.0 reticulate_1.10
## [73] pbapply_1.3-4 rpart_4.1-13 segmented_0.5-3.0
## [76] RSQLite_2.1.1 latticeExtra_0.6-28 stringi_1.2.4
## [79] S4Vectors_0.20.1 foreach_1.4.4 checkmate_1.8.5
## [82] BiocGenerics_0.28.0 caTools_1.17.1.1 bibtex_0.4.2
## [85] Rdpack_0.10-1 SDMTools_1.1-221 rlang_0.3.0.1
## [88] pkgconfig_2.0.2 dtw_1.20-1 prabclus_2.2-6
## [91] bitops_1.0-6 evaluate_0.12 lattice_0.20-38
## [94] ROCR_1.0-7 purrr_0.2.5 bindr_0.1.1
## [97] htmlwidgets_1.3 bit_1.1-14 tidyselect_0.2.5
## [100] plyr_1.8.4 magrittr_1.5 R6_2.3.0
## [103] IRanges_2.16.0 snow_0.4-3 gplots_3.0.1
## [106] Hmisc_4.1-1 DBI_1.0.0 pillar_1.3.1
## [109] foreign_0.8-71 withr_2.1.2 fitdistrplus_1.0-11
## [112] mixtools_1.1.0 RCurl_1.95-4.11 survival_2.43-3
## [115] nnet_7.3-12 tsne_0.1-3 tibble_1.4.2
## [118] crayon_1.3.4 hdf5r_1.0.1 KernSmooth_2.23-15
## [121] rmarkdown_1.11 progress_1.2.0 grid_3.5.1
## [124] data.table_1.11.8 blob_1.1.1 metap_1.0
## [127] digest_0.6.18 diptest_0.75-7 tidyr_0.8.2
## [130] R.utils_2.7.0 stats4_3.5.1 munsell_0.5.0
## [133] viridisLite_0.3.0
print(paste("Seurat ", packageVersion("Seurat")))
=======
## [1] biomaRt_2.36.1 knitr_1.20 bindrcpp_0.2.2 gridExtra_2.3
## [5] dplyr_0.7.6 Seurat_2.3.4 Matrix_1.2-14 cowplot_0.9.3
## [9] ggplot2_3.0.0
##
## loaded via a namespace (and not attached):
## [1] Rtsne_0.13 colorspace_1.3-2 class_7.3-14
## [4] modeltools_0.2-22 ggridges_0.5.0 mclust_5.4.1
## [7] rprojroot_1.3-2 htmlTable_1.12 base64enc_0.1-3
## [10] rstudioapi_0.7 proxy_0.4-22 flexmix_2.3-14
## [13] bit64_0.9-7 AnnotationDbi_1.42.1 mvtnorm_1.0-8
## [16] codetools_0.2-15 splines_3.5.1 R.methodsS3_1.7.1
## [19] robustbase_0.93-1.1 Formula_1.2-3 jsonlite_1.5
## [22] ica_1.0-2 cluster_2.0.7-1 kernlab_0.9-26
## [25] png_0.1-7 R.oo_1.22.0 compiler_3.5.1
## [28] httr_1.3.1 backports_1.1.2 assertthat_0.2.0
## [31] lazyeval_0.2.1 prettyunits_1.0.2 lars_1.2
## [34] acepack_1.4.1 htmltools_0.3.6 tools_3.5.1
## [37] igraph_1.2.1 gtable_0.2.0 glue_1.3.0
## [40] RANN_2.6 reshape2_1.4.3 Rcpp_1.0.0
## [43] Biobase_2.40.0 trimcluster_0.1-2.1 gdata_2.18.0
## [46] ape_5.1 nlme_3.1-137 iterators_1.0.10
## [49] fpc_2.1-11 lmtest_0.9-36 stringr_1.3.1
## [52] irlba_2.3.2 gtools_3.8.1 XML_3.98-1.12
## [55] DEoptimR_1.0-8 MASS_7.3-50 zoo_1.8-3
## [58] scales_0.5.0 hms_0.4.2 doSNOW_1.0.16
## [61] parallel_3.5.1 RColorBrewer_1.1-2 yaml_2.1.19
## [64] memoise_1.1.0 reticulate_1.7 pbapply_1.3-4
## [67] rpart_4.1-13 segmented_0.5-3.0 latticeExtra_0.6-28
## [70] stringi_1.2.4 RSQLite_2.1.1 S4Vectors_0.18.3
## [73] foreach_1.4.4 checkmate_1.8.5 BiocGenerics_0.26.0
## [76] caTools_1.17.1 SDMTools_1.1-221 rlang_0.2.1
## [79] pkgconfig_2.0.2 dtw_1.20-1 prabclus_2.2-6
## [82] bitops_1.0-6 evaluate_0.11 lattice_0.20-35
## [85] ROCR_1.0-7 purrr_0.2.5 bindr_0.1.1
## [88] htmlwidgets_1.2 bit_1.1-14 tidyselect_0.2.4
## [91] plyr_1.8.4 magrittr_1.5 R6_2.3.0
## [94] IRanges_2.14.10 snow_0.4-3 gplots_3.0.1
## [97] Hmisc_4.1-1 DBI_1.0.0 pillar_1.3.0
## [100] foreign_0.8-70 withr_2.1.2 fitdistrplus_1.0-9
## [103] mixtools_1.1.0 survival_2.42-6 RCurl_1.95-4.11
## [106] nnet_7.3-12 tibble_1.4.2 tsne_0.1-3
## [109] crayon_1.3.4 hdf5r_1.0.0 KernSmooth_2.23-15
## [112] rmarkdown_1.10 progress_1.2.0 grid_3.5.1
## [115] data.table_1.11.8 blob_1.1.1 metap_0.9
## [118] digest_0.6.17 diptest_0.75-7 tidyr_0.8.1
## [121] R.utils_2.7.0 stats4_3.5.1 munsell_0.5.0
>>>>>>> 96a75bd8e0458a8ddb96ee845073289da15b496e
## [1] "Seurat 2.3.4"
Load Data
<<<<<<< HEAD
#setwd("~/Desktop/PD_scRNAseq/")
dir.create(file.path("Data"), showWarnings=F)
load("Data/seurat_object_add_HTO_ids.Rdata")
pbmc <- seurat.obj
rm(seurat.obj)
pbmc
=======
#setwd("~/Desktop/PD_scRNAseq/")
dir.create(file.path("Data"), showWarnings=F)
load("Data/seurat_object_add_HTO_ids.Rdata")
pbmc <- seurat.obj
rm(seurat.obj)
pbmc
>>>>>>> 96a75bd8e0458a8ddb96ee845073289da15b496e
## An object of class seurat in project RAJ_13357
## 24914 genes across 22113 samples.
Clean Metadata
Add Metadata
<<<<<<< HEAD
metadata <- read.table("Data/meta.data4.tsv")
kable(head(metadata), caption = "Metadata") %>%
kable_styling(bootstrap_options = kableStyle) %>%
scroll_box(width = "100%", height = "500px")
Metadata
ID
nGene
nUMI
orig.ident
singlet.or.not.binary
percent.mito
res.2
res.1
res.0.6
res.0.3
CellType
barcode
dx
mut
Ethnicity
Gender
Age
AAAGCAAGTTTGTTGG
BIMD0007
784
1780
RAJ_13357
1
0.0314607
3
3
1
0
0
AAAGCAAGTTTGTTGG
PD
PD
White
M
59
TCAGCAATCTTGACGA
BIMD0007
742
1854
RAJ_13357
1
0.0302213
18
1
0
0
0
TCAGCAATCTTGACGA
PD
PD
White
M
59
AGCTCCTTCGCGTAGC
BIMD0007
495
988
RAJ_13357
1
0.0404858
14
7
2
3
3
AGCTCCTTCGCGTAGC
PD
PD
White
M
59
TATTACCCACTCTGTC
BIMD0007
812
1874
RAJ_13357
1
0.0469584
16
12
7
6
6
TATTACCCACTCTGTC
PD
PD
White
M
59
CTCGAGGAGCGATTCT
BIMD0007
863
2260
RAJ_13357
1
0.0212578
1
0
1
0
0
CTCGAGGAGCGATTCT
PD
PD
White
M
59
ATAAGAGCATCAGTCA
BIMD0007
803
2034
RAJ_13357
1
0.0216323
9
8
0
0
0
ATAAGAGCATCAGTCA
PD
PD
White
M
59
# Make AgeGroups
makeAgeGroups <- function(){
dim(metadata)
getMaxRound <- function(vals=metadata$Age, unit=10)unit*ceiling((max(vals)/unit))
getMinRound <- function(vals=metadata$Age, unit=10)unit*floor((min(vals)/unit))
ageBreaks = c(seq(getMinRound(), getMaxRound(), by = 10), getMaxRound()+10)
AgeGroupsUniq <- c()
for (i in 1:(length(ageBreaks)-1)){
AgeGroupsUniq <- append(AgeGroupsUniq, paste(ageBreaks[i],ageBreaks[i+1], sep="-"))
}
data.table::setDT(metadata,keep.rownames = T,check.names = F)[, AgeGroups := cut(Age,
breaks = ageBreaks,
right = F,
labels = AgeGroupsUniq,
nclude.lowest=T)]
metadata <- data.frame(metadata)
unique(metadata$AgeGroups)
head(metadata)
dim(metadata)
return(metadata)
}
# metadata <- makeAgeGroups()
pbmc <- AddMetaData(object = pbmc, metadata = metadata)
# Get rid of any NAs (cells that don't match up with the metadata)
cellLimiter <- ifelse(is.null(subsetCells), len(pbmc@cell.names), subsetCells)
pbmc <- FilterCells(object = pbmc, subset.names = "nGene", low.thresholds = 0,
# Subset for testing
cells.use = pbmc@cell.names[0:cellLimiter]
)
pbmc
=======
metadata <- read.table("Data/meta.data4.tsv")
kable(head(metadata), caption = "Metadata") %>%
kable_styling(bootstrap_options = kableStyle) %>%
scroll_box(width = "100%", height = "500px")
## Error in kable_styling(., bootstrap_options = kableStyle): could not find function "kable_styling"
# Make AgeGroups
makeAgeGroups <- function(){
dim(metadata)
getMaxRound <- function(vals=metadata$Age, unit=10)unit*ceiling((max(vals)/unit))
getMinRound <- function(vals=metadata$Age, unit=10)unit*floor((min(vals)/unit))
ageBreaks = c(seq(getMinRound(), getMaxRound(), by = 10), getMaxRound()+10)
AgeGroupsUniq <- c()
for (i in 1:(length(ageBreaks)-1)){
AgeGroupsUniq <- append(AgeGroupsUniq, paste(ageBreaks[i],ageBreaks[i+1], sep="-"))
}
data.table::setDT(metadata,keep.rownames = T,check.names = F)[, AgeGroups := cut(Age,
breaks = ageBreaks,
right = F,
labels = AgeGroupsUniq,
nclude.lowest=T)]
metadata <- data.frame(metadata)
unique(metadata$AgeGroups)
head(metadata)
dim(metadata)
return(metadata)
}
# metadata <- makeAgeGroups()
pbmc <- AddMetaData(object = pbmc, metadata = metadata)
# Get rid of any NAs (cells that don't match up with the metadata)
cellLimiter <- ifelse(is.null(subsetCells), len(pbmc@cell.names), subsetCells)
pbmc <- FilterCells(object = pbmc, subset.names = "nGene", low.thresholds = 0,
# Subset for testing
cells.use = pbmc@cell.names[0:cellLimiter]
)
pbmc
>>>>>>> 96a75bd8e0458a8ddb96ee845073289da15b496e
An object of class seurat in project RAJ_13357 24914 genes across 495 samples.
Filter & Normalize Data
Gene Biotypes
Include only subsets of genes by type. Biotypes from: https://useast.ensembl.org/info/genome/genebuild/biotypes.html
<<<<<<< HEAD
## Seurat::FindGeneTerms() # Enrichr API
## Seurat::MultiModal_CCA() # Integrates data from disparate datasets (CIA version too)
if(!is.null(subsetGenes)){
# If the gene_biotypes file exists, import csv. Otherwise, get from biomaRt
if(file_test("-f", "Data/gene_biotypes.csv")){
biotypes <- read.csv("Data/gene_biotypes.csv")
}
else {
ensembl <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org",
dataset="hsapiens_gene_ensembl")
ensembl <- useDataset(mart = ensembl, dataset = "hsapiens_gene_ensembl")
listFilters(ensembl)
listAttributes(ensembl)
biotypes <- getBM(attributes=c("hgnc_symbol", "gene_biotype"), filters="hgnc_symbol",
values=row.names(pbmc@data), mart=ensembl)
write.csv(biotypes, "Data/gene_biotypes.csv", quote=F, row.names=F)
}
# Subset data by creating new Seurat object (annoying but necessary)
geneSubset <- biotypes[biotypes$gene_biotype==subsetGenes,"hgnc_symbol"]
print(paste(dim(pbmc@raw.data[geneSubset, ])[1],"/", dim(pbmc@raw.data)[1],
"genes are", subsetGenes))
# Add back into pbmc
subset.matrix <- pbmc@raw.data[geneSubset, ] # Pull the raw expression matrix from the original Seurat object containing only the genes of interest
pbmc_sub <- CreateSeuratObject(subset.matrix) # Create a new Seurat object with just the genes of interest
orig.ident <- row.names(pbmc@meta.data) # Pull the identities from the original Seurat object as a data.frame
pbmc_sub <- AddMetaData(object = pbmc_sub, metadata = pbmc@meta.data) # Add the idents to the meta.data slot
pbmc_sub <- SetAllIdent(object = pbmc_sub, id = "ident") # Assign identities for the new Seurat object
pbmc <- pbmc_sub
rm(pbmc_sub)
pbmc
}
=======
## Seurat::FindGeneTerms() # Enrichr API
## Seurat::MultiModal_CCA() # Integrates data from disparate datasets (CIA version too)
if(!is.null(subsetGenes)){
# If the gene_biotypes file exists, import csv. Otherwise, get from biomaRt
if(file_test("-f", "Data/gene_biotypes.csv")){
biotypes <- read.csv("Data/gene_biotypes.csv")
}
else {
ensembl <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org",
dataset="hsapiens_gene_ensembl")
ensembl <- useDataset(mart = ensembl, dataset = "hsapiens_gene_ensembl")
listFilters(ensembl)
listAttributes(ensembl)
biotypes <- getBM(attributes=c("hgnc_symbol", "gene_biotype"), filters="hgnc_symbol",
values=row.names(pbmc@data), mart=ensembl)
write.csv(biotypes, "Data/gene_biotypes.csv", quote=F, row.names=F)
}
# Subset data by creating new Seurat object (annoying but necessary)
geneSubset <- biotypes[biotypes$gene_biotype==subsetGenes,"hgnc_symbol"]
print(paste(dim(pbmc@raw.data[geneSubset, ])[1],"/", dim(pbmc@raw.data)[1],
"genes are", subsetGenes))
# Add back into pbmc
subset.matrix <- pbmc@raw.data[geneSubset, ] # Pull the raw expression matrix from the original Seurat object containing only the genes of interest
pbmc_sub <- CreateSeuratObject(subset.matrix) # Create a new Seurat object with just the genes of interest
orig.ident <- row.names(pbmc@meta.data) # Pull the identities from the original Seurat object as a data.frame
pbmc_sub <- AddMetaData(object = pbmc_sub, metadata = pbmc@meta.data) # Add the idents to the meta.data slot
pbmc_sub <- SetAllIdent(object = pbmc_sub, id = "ident") # Assign identities for the new Seurat object
pbmc <- pbmc_sub
rm(pbmc_sub)
pbmc
}
>>>>>>> 96a75bd8e0458a8ddb96ee845073289da15b496e
## [1] "14827 / 24914 genes are protein_coding"
## An object of class seurat in project SeuratProject
## 14827 genes across 27863 samples.
Filter Cells, Gene Variance, & Normalize
Filter by cells, normalize , filter by gene variability.
** Important!**: Specify do.par = T, and num.cores = nCores in ‘ScaleData’ to use all available cores.
<<<<<<< HEAD
pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"),
low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))
pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize",
scale.factor = 10000)
# Store the top most variable genes in @var.genes
pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR,
x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

# IMPORTANT!: Must set do.par=T and num.cors = n for large datasets being processed on computing clusters
pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"), do.par = T, num.cores = nCores)
## Regressing out: nUMI, percent.mito
## Warning in RegressOutResid(object = object, vars.to.regress =
## vars.to.regress, : num.cores set greater than number of available cores(4).
## Setting num.cores to 3.
##
## Time Elapsed: 6.84275984764099 secs
=======
pbmc <- FilterCells(object = pbmc, subset.names = c("nGene", "percent.mito"),
low.thresholds = c(200, -Inf), high.thresholds = c(2500, 0.05))
pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize",
scale.factor = 10000)
# Store the top most variable genes in @var.genes
pbmc <- FindVariableGenes(object = pbmc, mean.function = ExpMean, dispersion.function = LogVMR,
x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5)

# IMPORTANT!: Must set do.par=T and num.cors = n for large datasets being processed on computing clusters
pbmc <- ScaleData(object = pbmc, vars.to.regress = c("nUMI", "percent.mito"), do.par = T, num.cores = nCores)
## Regressing out: nUMI, percent.mito
##
## Time Elapsed: 52.0813755989075 secs
>>>>>>> 96a75bd8e0458a8ddb96ee845073289da15b496e
## Scaling data matrix
VlnPlot(object = pbmc, features.plot = c("nGene", "nUMI", "percent.mito"),nCol = 3) # par(mfrow = c(1, 2))
gp1 <- GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "percent.mito", pch.use=20,
do.hover=T, data.hover = "mut")gp1gp2 <- GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "nGene", pch.use=20,
do.hover=T, data.hover = "mut")gp2# par(mfrow = c(1, 2))
gp1 <- GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "percent.mito", pch.use=20,
do.hover=T, data.hover = "mut")gp2 <- GenePlot(object = pbmc, gene1 = "nUMI", gene2 = "nGene", pch.use=20,
do.hover=T, data.hover = "mut")ProjectPCA scores each gene in the dataset (including genes not included in the PCA) based on their correlation with the calculated components. Though we don’t use this further here, it can be used to identify markers that are strongly correlated with cellular heterogeneity, but may not have passed through variable gene selection. The results of the projected PCA can be explored by setting use.full=T in the functions above
# Run PCA with only the top most variables genes
pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print=F)
#, pcs.print = 1:5, genes.print = 5VizPCA(object = pbmc, pcs.use = 1:2)PCAPlot(object = pbmc, dim.1 = 1, dim.2 = 2)pbmc <- ProjectPCA(object = pbmc, do.print=F)
## PCA Heatmap: PC1
PCHeatmap(object = pbmc, pc.use = 1, cells.use = 500, do.balanced=T, label.columns=F)## PCA Heatmap: PC1-PCn
PCHeatmap(object = pbmc, pc.use = 1:12, cells.use = 500, do.balanced=T,
label.columns=F, use.full=F)# Run PCA with only the top most variables genes
pbmc <- RunPCA(object = pbmc, pc.genes = pbmc@var.genes, do.print=F)
#, pcs.print = 1:5, genes.print = 5pbmc <- ProjectPCA(object = pbmc, do.print=F)
## PCA Heatmap: PC1
PCHeatmap(object = pbmc, pc.use = 1, cells.use = 500, do.balanced=T, label.columns=F)## PCA Heatmap: PC1-PCn
PCHeatmap(object = pbmc, pc.use = 1:12, cells.use = 500, do.balanced=T,
label.columns=F, use.full=F)Determine statistically significant PCs for further analysis. NOTE: This process can take a long time for big datasets, comment out for expediency. More approximate techniques such as those implemented in PCElbowPlot() can be used to reduce computation time
<<<<<<< HEAD#pbmc <- JackStraw(object = pbmc, num.replicate = 100, display.progress = FALSE)
PCElbowPlot(object = pbmc)#pbmc <- JackStraw(object = pbmc, num.replicate = 100, display.progress = FALSE)
PCElbowPlot(object = pbmc)We first construct a KNN graph based on the euclidean distance in PCA space, and refine the edge weights between any two cells based on the shared overlap in their local neighborhoods (Jaccard similarity). To cluster the cells, we apply modularity optimization techniques such as the Louvain algorithm (default) or SLM [SLM, Blondel et al., Journal of Statistical Mechanics], to iteratively group cells together, with the goal of optimizing the standard modularity function.
<<<<<<< HEADOn Resolution
The FindClusters function implements the procedure, and contains a resolution parameter that sets the ‘granularity’ of the downstream clustering, with increased values leading to a greater number of clusters. We find that setting this parameter between 0.6-1.2 typically returns good results for single cell datasets of around 3K cells. Optimal resolution often increases for larger datasets. The clusters are saved in the object@ident slot.
# TRY DIFFERENT RESOLUTIONS
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10,
resolution = resolution, print.output = 0, save.SNN = T)
PrintFindClustersParams(object = pbmc) ## Parameters used in latest FindClusters calculation run on: 2019-01-03 14:42:52
## =============================================================================
## Resolution: 0.6
=======
# TRY DIFFERENT RESOLUTIONS
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10,
resolution = resolution, print.output = 0, save.SNN = T)
PrintFindClustersParams(object = pbmc)
## Parameters used in latest FindClusters calculation run on: 2019-01-03 17:25:07
## =============================================================================
## Resolution: 1
>>>>>>> 96a75bd8e0458a8ddb96ee845073289da15b496e
## -----------------------------------------------------------------------------
## Modularity Function Algorithm n.start n.iter
## 1 1 100 10
## -----------------------------------------------------------------------------
## Reduction used k.param prune.SNN
## pca 30 0.0667
## -----------------------------------------------------------------------------
## Dims used in calculation
## =============================================================================
## 1 2 3 4 5 6 7 8 9 10
As input to the tSNE, we suggest using the same PCs as input to the clustering analysis, although computing the tSNE based on scaled gene expression is also supported using the genes.use argument.
** Important!**: Specify num_threads=0 in ‘RunTSNE’ to use all available cores.
<<<<<<< HEADlabSize <- 6
#pbmc <- StashIdent(object = pbmc, save.name = "pre_clustering")
#pbmc <- SetAllIdent(object = pbmc, id = "pre_clustering")
pbmc <- RunTSNE(object=pbmc, reduction.use = "pca", dims.use = 1:10, do.fast = TRUE,
tsne.method = "Rtsne", num_threads=0) # num_threads
# note that you can set do.label=T to help label individual clusters
TSNEPlot(object = pbmc, do.label=T, label.size = labSize) saveRDS(pbmc, file = "Data/cd14-processed.rds") tSNE_metadata_plot <- function(var){
print(paste("t-SNE Metadata plot for ", var))
# Metadata plot
p1 <- TSNEPlot(pbmc, do.return = T, pt.size = 0.5, group.by = var, do.label = T,
dark.theme=F, plot.title=paste("Color by ",var), vector.friendly=T)
# t-SNE clusters plot
p2 <- TSNEPlot(pbmc, do.label = T, do.return = T, pt.size = 0.5, plot.title=paste("Color by Unsupervised Clusters"), vector.friendly=T)
print(plot_grid(p1, p2))
}
# metaVars <- c("CellType","dx","mut","Gender","Age")
#
# for (var in metaVars){
# print(paste("t-SNE Metadata plot for ",var))
# # Metadata plot
# p1 <- TSNEPlot(pbmc, do.return = T, pt.size = 0.5, group.by = var, do.label = T,
# dark.theme=F, plot.title=paste("Color by ",var))
# # t-SNE clusters plot
# p2 <- TSNEPlot(pbmc, do.label = T, do.return = T, pt.size = 0.5, plot.title=paste("Color by t-SNE clusters"))
# print(plot_grid(p1, p2))
# } tSNE_metadata_plot("CellType") ## [1] "t-SNE Metadata plot for CellType"
tSNE_metadata_plot("dx") ## [1] "t-SNE Metadata plot for dx"
tSNE_metadata_plot("mut") ## [1] "t-SNE Metadata plot for mut"
tSNE_metadata_plot("Gender") ## [1] "t-SNE Metadata plot for Gender"
tSNE_metadata_plot("Age") ## [1] "t-SNE Metadata plot for Age"
labSize <- 6
#pbmc <- StashIdent(object = pbmc, save.name = "pre_clustering")
#pbmc <- SetAllIdent(object = pbmc, id = "pre_clustering")
pbmc <- RunTSNE(object=pbmc, reduction.use = "pca", dims.use = 1:10, do.fast = TRUE,
tsne.method = "Rtsne", num_threads=0) # num_threads
# note that you can set do.label=T to help label individual clusters
TSNEPlot(object = pbmc, do.label=T, label.size = labSize) tSNE_metadata_plot <- function(var){
print(paste("t-SNE Metadata plot for ", var))
# Metadata plot
p1 <- TSNEPlot(pbmc, do.return = T, pt.size = 0.5, group.by = var, do.label = T,
dark.theme=F, plot.title=paste("Color by ",var), vector.friendly=T)
# t-SNE clusters plot
p2 <- TSNEPlot(pbmc, do.label = T, do.return = T, pt.size = 0.5, plot.title=paste("Color by Unsupervised Clusters"), vector.friendly=T)
print(plot_grid(p1, p2))
}
# metaVars <- c("CellType","dx","mut","Gender","Age")
#
# for (var in metaVars){
# print(paste("t-SNE Metadata plot for ",var))
# # Metadata plot
# p1 <- TSNEPlot(pbmc, do.return = T, pt.size = 0.5, group.by = var, do.label = T,
# dark.theme=F, plot.title=paste("Color by ",var))
# # t-SNE clusters plot
# p2 <- TSNEPlot(pbmc, do.label = T, do.return = T, pt.size = 0.5, plot.title=paste("Color by t-SNE clusters"))
# print(plot_grid(p1, p2))
# } ## [1] "t-SNE Metadata plot for Age"
Seurat has several tests for differential expression which can be set with the test.use parameter (see the DE vignette for details). For example, the ROC test returns the ‘classification power’ for any individual marker (ranging from 0 - random, to 1 - perfect).
### Biomarkers: One Cluster vs. Specific Clusters
# cluster5.markers <- FindMarkers(object = pbmc, ident.1 = 0, ident.2 = c(2),
# min.pct = 0.25)
# print(x = head(x = cluster5.markers, n = 3))
### Biomarkers: One Cluster vs. All Other Clusters
# find all markers of a given cluster
# MUST run FindClusters() first
# cluster0.markers <- FindMarkers(object = pbmc, ident.1 = 0, min.pct = 0.25)
# print(x = head(x = cluster0.markers, n = 3))
### Biomarkers: All Clusters vs. All Other Clusters ***
# find markers for every cluster compared to all remaining cells, report
# only the positive ones
pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
topBiomarkers <- pbmc.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
kable(topBiomarkers) %>% kable_styling(bootstrap_options = kableStyle)| p_val | avg_logFC | pct.1 | pct.2 | p_val_adj | cluster | gene |
|---|---|---|---|---|---|---|
| 0 | 0.9939484 | 1.000 | 1.000 | 0 | 0 | S100A8 |
| 0 | 1.0147286 | 0.938 | 0.692 | 0 | 0 | S100A12 |
| 0 | 1.3719797 | 0.727 | 0.168 | 0 | 1 | FCGR3A |
| 0 | 1.3581437 | 0.388 | 0.086 | 0 | 1 | C1QA |
| 0 | 2.1004198 | 0.784 | 0.069 | 0 | 2 | FCER1A |
| 0 | 1.4090180 | 0.664 | 0.140 | 0 | 2 | CLEC10A |
getTopBiomarker <- function(pbmc.markers, clusterID, topN=1){
df <- subset(pbmc.markers, p_val_adj<0.05 & cluster==as.character(clusterID)) %>% arrange(desc(avg_logFC))
top_pct_markers <- df[1:topN,"gene"]
return(top_pct_markers)
}
# clust1_biomarkers <- getTopBiomarker(pbmc.markers, clusterID=1, topN=2)
# clust2_biomarkers <- getTopBiomarker(pbmc.markers, clusterID=2, topN=2)
### Plot biomarkers
plotBiomarkers <- function(pbmc, biomarkers, cluster){
biomarkerPlots <- list()
for (marker in biomarkers){
#print(marker)
p <- VlnPlot(object = pbmc, features.plot = c(marker), y.log=T, return.plotlist=T)
biomarkerPlots[[marker]] <- p + ggplot2::aes(alpha=.7)
}
combinedPlot <- do.call(grid.arrange, c(biomarkerPlots, list(ncol=2, top=paste("Top DEG Biomarkers for Cluster",cluster))) )
return(combinedPlot)
}
# Plot top 2 biomarker genes for each
for (clust in unique(pbmc.markers$cluster)){
cat('\n')
cat("### Cluster ",clust,"\n")
biomarkers <- getTopBiomarker(pbmc.markers, clusterID=clust, topN=2)
plotBiomarkers(pbmc, biomarkers, clust)
cat('\n')
} NA
### Biomarkers: One Cluster vs. Specific Clusters
# cluster5.markers <- FindMarkers(object = pbmc, ident.1 = 0, ident.2 = c(2),
# min.pct = 0.25)
# print(x = head(x = cluster5.markers, n = 3))
### Biomarkers: One Cluster vs. All Other Clusters
# find all markers of a given cluster
# MUST run FindClusters() first
# cluster0.markers <- FindMarkers(object = pbmc, ident.1 = 0, min.pct = 0.25)
# print(x = head(x = cluster0.markers, n = 3))
### Biomarkers: All Clusters vs. All Other Clusters ***
# find markers for every cluster compared to all remaining cells, report
# only the positive ones
pbmc.markers <- FindAllMarkers(object = pbmc, only.pos = TRUE, min.pct = 0.25, thresh.use = 0.25)
topBiomarkers <- pbmc.markers %>% group_by(cluster) %>% top_n(2, avg_logFC)
kable(topBiomarkers) %>% kable_styling(bootstrap_options = kableStyle)## Error in kable_styling(., bootstrap_options = kableStyle): could not find function "kable_styling"
getTopBiomarker <- function(pbmc.markers, clusterID, topN=1){
df <- subset(pbmc.markers, p_val_adj<0.05 & cluster==as.character(clusterID)) %>% arrange(desc(avg_logFC))
top_pct_markers <- df[1:topN,"gene"]
return(top_pct_markers)
}
# clust1_biomarkers <- getTopBiomarker(pbmc.markers, clusterID=1, topN=2)
# clust2_biomarkers <- getTopBiomarker(pbmc.markers, clusterID=2, topN=2)
### Plot biomarkers
plotBiomarkers <- function(pbmc, biomarkers, cluster){
biomarkerPlots <- list()
for (marker in biomarkers){
#print(marker)
p <- VlnPlot(object = pbmc, features.plot = c(marker), y.log=T, return.plotlist=T)
biomarkerPlots[[marker]] <- p + ggplot2::aes(alpha=.7)
}
combinedPlot <- do.call(grid.arrange, c(biomarkerPlots, list(ncol=2, top=paste("Top DEG Biomarkers for Cluster",cluster))) )
return(combinedPlot)
}
# Plot top 2 biomarker genes for each
for (clust in unique(pbmc.markers$cluster)){
cat('\n')
cat("### Cluster ",clust,"\n")
biomarkers <- getTopBiomarker(pbmc.markers, clusterID=clust, topN=2)
plotBiomarkers(pbmc, biomarkers, clust)
cat('\n')
} ## Error in if (all(unlist(x = strsplit(x = my.var, split = "[0-9]+")) == : missing value where TRUE/FALSE needed
top1 <- pbmc.markers %>% group_by(cluster) %>% top_n(1, avg_logFC)
nCols = round( length(unique(top1$cluster)) / 3 )
figHeight <- nCols*7FeaturePlot(object = pbmc, features.plot = top1$gene, cols.use = c("grey", "blue"),
reduction.use = "tsne", nCol = nCols)top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)
# setting slim.col.label to TRUE will print just the cluster IDS instead of
# every cell name
DoHeatmap(object = pbmc, genes.use = top10$gene, slim.col.label=T, remove.key=T)RidgePlot(pbmc, features.plot = top1$gene, nCol = nCols, do.sort = F)## Picking joint bandwidth of 0.298
## Picking joint bandwidth of 0.117
## Picking joint bandwidth of 0.12
top1 <- pbmc.markers %>% group_by(cluster) %>% top_n(1, avg_logFC)
nCols = round( length(unique(top1$cluster)) / 3 )
figHeight <- nCols*7FeaturePlot(object = pbmc, features.plot = top1$gene, cols.use = c("grey", "blue"),
reduction.use = "tsne", nCol = nCols)top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(10, avg_logFC)
# setting slim.col.label to TRUE will print just the cluster IDS instead of
# every cell name
DoHeatmap(object = pbmc, genes.use = top10$gene, slim.col.label=T, remove.key=T)## Picking joint bandwidth of 0.295
## Picking joint bandwidth of 0.101
## Picking joint bandwidth of 0.0874
## Picking joint bandwidth of 0.325
current.cluster.ids <- unique(pbmc.markers$cluster) #c(0, 1, 2, 3, 4, 5, 6, 7)
top1 <- pbmc.markers %>% group_by(cluster) %>% top_n(1, avg_logFC)
new.cluster.ids <- top1$gene #c("CD4 T cells", "CD14+ Monocytes", "B cells", "CD8 T cells", "FCGR3A+ Monocytes", "NK cells", "Dendritic cells", "Megakaryocytes")
pbmc@ident <- plyr::mapvalues(x = pbmc@ident, from = current.cluster.ids, to = new.cluster.ids)
TSNEPlot(object=pbmc, do.label=T, pt.size=0.5) current.cluster.ids <- unique(pbmc.markers$cluster) #c(0, 1, 2, 3, 4, 5, 6, 7)
top1 <- pbmc.markers %>% group_by(cluster) %>% top_n(1, avg_logFC)
new.cluster.ids <- top1$gene #c("CD4 T cells", "CD14+ Monocytes", "B cells", "CD8 T cells", "FCGR3A+ Monocytes", "NK cells", "Dendritic cells", "Megakaryocytes")
pbmc@ident <- plyr::mapvalues(x = pbmc@ident, from = current.cluster.ids, to = new.cluster.ids)
TSNEPlot(object=pbmc, do.label=T, pt.size=0.5) Further subdivisions within cell types.
If you perturb some of our parameter choices above (for example, setting resolution=0.8 or changing the number of PCs), you might see the CD4 T cells subdivide into two groups. You can explore this subdivision to find markers separating the two T cell subsets. However, before reclustering (which will overwrite object@ident), we can stash our renamed identities to be easily recovered later.
# First lets stash our identities for later
pbmc <- StashIdent(object = pbmc, save.name = "ClusterNames_0.6")
# Note that if you set save.snn=T above, you don't need to recalculate the
# SNN, and can simply put: pbmc <- FindClusters(pbmc,resolution = 0.8)
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10,
resolution = 0.8, print.output = FALSE)## Warning in BuildSNN(object = object, genes.use = genes.use, reduction.type
## = reduction.type, : Build parameters exactly match those of already
## computed and stored SNN. To force recalculation, set force.recalc to TRUE.
## Warning in BuildSNN(object = object, genes.use = genes.use, reduction.type
## = reduction.type, : Build parameters exactly match those of already
## computed and stored SNN. To force recalculation, set force.recalc to TRUE.
# Demonstration of how to plot two tSNE plots side by side, and how to color
# points based on different criteria
plot1 <- TSNEPlot(object = pbmc, do.return = TRUE, no.legend = TRUE, do.label = TRUE, label.size=labSize)
plot2 <- TSNEPlot(object = pbmc, do.return = TRUE, group.by = "ClusterNames_0.6",
no.legend = TRUE, do.label = TRUE, label.size=labSize)
plot_grid(plot1, plot2)# Find discriminating markers
tcell.markers <- FindMarkers(object = pbmc, ident.1 = 0, ident.2 = 1)
# Most of the markers tend to be expressed in C1 (i.e. S100A4). However, we
# can see that CCR7 is upregulated in C0, strongly indicating that we can
# differentiate memory from naive CD4 cells. cols.use demarcates the color
# palette from low to high expression
FeaturePlot(object = pbmc, features.plot = top1$gene, cols.use = c("green", "blue"))pbmc <- SetAllIdent(object = pbmc, id = "ClusterNames_0.6") saveRDS(pbmc, file = "Data/cd14-processed.rds") # First lets stash our identities for later
pbmc <- StashIdent(object = pbmc, save.name = "ClusterNames_0.6")
# Note that if you set save.snn=T above, you don't need to recalculate the
# SNN, and can simply put: pbmc <- FindClusters(pbmc,resolution = 0.8)
pbmc <- FindClusters(object = pbmc, reduction.type = "pca", dims.use = 1:10,
resolution = 0.8, print.output = FALSE)## Warning in BuildSNN(object = object, genes.use = genes.use, reduction.type
## = reduction.type, : Build parameters exactly match those of already
## computed and stored SNN. To force recalculation, set force.recalc to TRUE.
## Warning in BuildSNN(object = object, genes.use = genes.use, reduction.type
## = reduction.type, : Build parameters exactly match those of already
## computed and stored SNN. To force recalculation, set force.recalc to TRUE.
# Demonstration of how to plot two tSNE plots side by side, and how to color
# points based on different criteria
plot1 <- TSNEPlot(object = pbmc, do.return = TRUE, no.legend = TRUE, do.label = TRUE, label.size=labSize)
plot2 <- TSNEPlot(object = pbmc, do.return = TRUE, group.by = "ClusterNames_0.6",
no.legend = TRUE, do.label = TRUE, label.size=labSize)
plot_grid(plot1, plot2)# Find discriminating markers
tcell.markers <- FindMarkers(object = pbmc, ident.1 = 0, ident.2 = 1)
# Most of the markers tend to be expressed in C1 (i.e. S100A4). However, we
# can see that CCR7 is upregulated in C0, strongly indicating that we can
# differentiate memory from naive CD4 cells. cols.use demarcates the color
# palette from low to high expression
FeaturePlot(object = pbmc, features.plot = top1$gene, cols.use = c("green", "blue"))